{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n",
    "\n",
    "data = np.load('/home/hll/IDS/2020/data/select/zuizhong/data_train.npy')\n",
    "label= np.load('/home/hll/IDS/2020/data/select/zuizhong/label6_train.npy')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#  SMOTE all minority classes to (number of data set samples / classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "X=np.array(data)  \n",
    "b=np.array(label)\n",
    "bb=b.reshape(b.shape[0],)    \n",
    "y10 = np.int32(bb)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3167779, 12)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, 2443895), (1, 200334), (2, 359), (3, 458010), (4, 65144), (5, 37)]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted(Counter(y10).items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 2443895), (1, 527963), (2, 527963), (3, 527963), (4, 527963), (5, 527963)]\n",
      "time: 2295.455605983734\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import SMOTE\n",
    "import time\n",
    "time_start = time.time()\n",
    "\n",
    "guo = 527963  #Oversampling samples\n",
    "\n",
    "smo = SMOTE(ratio={1:guo,2:guo,3:guo,4:guo,5:guo},random_state=42)\n",
    "\n",
    "X_smo, y_smo = smo.fit_sample(X, y10)   \n",
    "print(sorted(Counter(y_smo).items()))\n",
    "\n",
    "time_end = time.time()\n",
    "time = time_end - time_start\n",
    "print(\"time:\",time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5083710"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_smo.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#  Extract Majority class of data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "list0 = []  #data-0\n",
    "list1 = []  #other classes data\n",
    "list2 = [] #other classes label\n",
    "\n",
    "\n",
    "for i in range(X_smo.shape[0]):\n",
    "    if y_smo[i] == 0:\n",
    "        list0.append(X_smo[i])\n",
    "    else:\n",
    "        list1.append(X_smo[i])\n",
    "        list2.append(y_smo[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Normal class data shape： (2443895, 12)\n",
      "1-5 class data shape： (2639815, 12)\n",
      "1-5 class data shape： (2639815,)\n"
     ]
    }
   ],
   "source": [
    "data0 = np.array(list0)  \n",
    "data1 = np.array(list1)\n",
    "label2 = np.array(list2)\n",
    "# label22 = label2.reshape(label2.shape[0],)  \n",
    "\n",
    "print(\"Normal class data shape：\",data0.shape)\n",
    "print(\"1-5 class data shape：\",data1.shape)  \n",
    "print(\"1-5 class data shape：\",label2.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# # Cluster majority data into  C (total number of classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time: 22.22838044166565\n"
     ]
    }
   ],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "import time\n",
    "time_start = time.time()\n",
    "\n",
    "estimator = KMeans(n_clusters=6)\n",
    "estimator.fit(data0) \n",
    "\n",
    "time_end = time.time()\n",
    "time = time_end - time_start\n",
    "print(\"time:\",time)\n",
    "\n",
    "label_pred_0 = estimator.labels_ "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, 687683), (1, 258667), (2, 718137), (3, 110457), (4, 117957), (5, 550994)]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted(Counter(label_pred_0).items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_pred = label_pred_0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# # Select a certain amount of data from each cluster to form a new majority data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "c0 = []\n",
    "c1 = []\n",
    "c2 = []\n",
    "c3 = []\n",
    "c4 = []\n",
    "c5 = []\n",
    "\n",
    "s0=s1=s2=s3=s4=s5=0\n",
    "\n",
    "for i in range(data0.shape[0]):\n",
    "    if label_pred[i] == 0:\n",
    "        c0.append(data0[i])\n",
    "        s0=s0+1\n",
    "    elif label_pred[i] == 1:\n",
    "        c1.append(data0[i])\n",
    "        s1=s1+1\n",
    "    elif label_pred[i] == 2:\n",
    "        c2.append(data0[i])\n",
    "        s2=s2+1\n",
    "    elif label_pred[i] == 3:\n",
    "        c3.append(data0[i])\n",
    "        s3=s3+1\n",
    "    elif label_pred[i] == 4:\n",
    "        c4.append(data0[i])\n",
    "        s4=s4+1\n",
    "    elif label_pred[i] == 5:\n",
    "        c5.append(data0[i])\n",
    "        s5=s5+1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "a=87993"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "del c0[a:len(c0)]\n",
    "del c1[a:len(c1)]\n",
    "del c2[a:len(c2)]\n",
    "del c3[a:len(c3)]\n",
    "del c4[a:len(c4)]\n",
    "del c5[a:len(c5)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "c00 = np.array(c0)\n",
    "c11 = np.array(c1)\n",
    "c22 = np.array(c2)\n",
    "c33 = np.array(c3)\n",
    "c44 = np.array(c4)\n",
    "c55 = np.array(c5)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(527958, 12)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "q0 = np.concatenate((c00,c11,c22,c33,c44,c55),axis=0)\n",
    "q0.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(527958,)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "label_zc = np.zeros((q0.shape[0],), dtype=int)\n",
    "label_zc.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_end = np.concatenate((q0,data1),axis=0)\n",
    "label_end = np.concatenate((label_zc,label2),axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, 527958), (1, 527963), (2, 527963), (3, 527963), (4, 527963), (5, 527963)]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted(Counter(label_end).items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_end = label_end.reshape(label_end.shape[0],1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#save dataset\n",
    "np.save(\"/home/hll/IDS/2020/data/select/zuizhong/K-means+SMOTE_data_train.npy\",data_end)\n",
    "np.save(\"/home/hll/IDS/2020/data/select/zuizhong/K-means+SMOTE_label_train.npy\",label_end)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
